Question 1

Part A

In a small school, an outlier has a much larger influence on the mean: a single very high or very low score can drag the school average up or down. Small schools should therefore produce more extreme averages than large ones.
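
A quick simulation makes this concrete (a minimal sketch with hypothetical \(N(50,10^2)\) scores, not the mathtest data): averages of 5 students vary far more than averages of 100.

set.seed(1)
#spread of simulated school averages for small vs. large schools
small=replicate(10000,mean(rnorm(5,50,10)))
large=replicate(10000,mean(rnorm(100,50,10)))
sd(small)   #about 10/sqrt(5), i.e. ~4.5
sd(large)   #about 10/sqrt(100), i.e. ~1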

Part B

Model \[y_{ij}\sim N(\theta_i,\sigma^2),\qquad \theta_i \sim N(\mu,\tau^2\sigma^2)\] with priors \(\mu\sim N(\mu_0,\sigma_0^2)\), \(\tau^2\sim IG(a,b)\), and \(\sigma^2\sim IG(c,d)\). The full joint density is \[\begin{align*} L &=\prod_{i=1}^{n}\prod_{j=1}^{n_i}p(y_{ij}|\theta_i,\sigma^2)\prod_{i=1}^{n} p(\theta_i|\mu,\tau^2,\sigma^2)\,p(\mu)\,p(\tau^2)\,p(\sigma^2) \\ &\propto\prod_{i=1}^{n}\prod_{j=1}^{n_i}\frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{(y_{ij}-\theta_i)^2}{2\sigma^2}}\prod_{i=1}^{n} \frac{1}{\sqrt{2\pi\tau^2 \sigma^2}}e^{-\frac{(\theta_i-\mu)^2}{2\tau^2\sigma^2}}\cdot\frac{1}{\sqrt{2\pi\sigma_0^2}}e^{-\frac{(\mu-\mu_0)^2}{2\sigma_0^2}}\,(\tau^2)^{-a-1}e^{-\frac{b}{\tau^2}}\,(\sigma^2)^{-c-1}e^{-\frac{d}{\sigma^2}} \end{align*}\]

Hence the full conditionals are \[\theta_i\,|\,\cdot \sim N\left(\frac{\frac{\mu}{\tau^2\sigma^2}+\frac{n_i\bar{y}_i}{\sigma^2}}{\frac{1}{\tau^2\sigma^2}+\frac{n_i}{\sigma^2}},\ \frac{1}{\frac{1}{\tau^2\sigma^2}+\frac{n_i}{\sigma^2}}\right)\] \[\mu\,|\,\cdot \sim N\left(\frac{\frac{\mu_0}{\sigma_0^2}+\frac{\sum_i\theta_i}{\tau^2\sigma^2}}{\frac{1}{\sigma_0^2}+\frac{n}{\tau^2\sigma^2}},\ \frac{1}{\frac{1}{\sigma_0^2}+\frac{n}{\tau^2\sigma^2}}\right)\] \[\tau^2\,|\,\cdot\sim IG\left(a+\frac{n}{2},\ b+\frac{\sum_i(\theta_i-\mu)^2}{2\sigma^2}\right)\] \[\sigma^2\,|\,\cdot \sim IG\left(c+\frac{N}{2}+\frac{n}{2},\ d+\frac{1}{2}\sum_i\sum_j(y_{ij}-\theta_i)^2+\frac{\sum_i(\theta_i-\mu)^2}{2\tau^2}\right)\]

Equivalently, we could parameterize with the precisions \(\eta=\frac{1}{\tau^2}\) and \(\lambda=\frac{1}{\sigma^2}\), whose full conditionals are Gamma; we would reach the same sampler.
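
For instance, the full conditional for \(\eta\) is just the \(\tau^2\) conditional re-expressed as a Gamma: \[\eta\,|\,\cdot \sim Ga\left(a+\frac{n}{2},\ b+\frac{\sum_i(\theta_i-\mu)^2}{2\sigma^2}\right)\]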

rm(list=ls())
#library(mosaic)    # nice for its overloading of the mean() function
library(invgamma)
mathtest = read.csv('~/Desktop/StatModeling2/mathtest.csv')

#create hierarchical y: one vector of scores per school
factor_school = factor(mathtest$school)
y = vector("list", nlevels(factor_school))
for(i in 1:nlevels(factor_school)){
  y[[i]] = mathtest$mathscore[factor_school == levels(factor_school)[i]]
}


#school sizes (table is safe regardless of the number of levels)
ni = as.numeric(table(factor_school))
#total size
N=sum(ni)
#number of schools
n=nlevels(factor_school)

#placeholders
iter=5000
theta <- matrix(0,nrow = n, ncol = (iter+1))
mu <- NULL
t2 <- NULL
sigma2 <- NULL

#Set hyperparameters: tau2 ~ IG(a,b), sigma2 ~ IG(c,d), mu ~ N(mu0, sig0), sig0 = sigma_0^2
a=1
b=1
c=1
d=1
mu0=50
sig0=1

#Initialization
t2[1]=rinvgamma(1,a,b)
sigma2[1]=rinvgamma(1,c,d)
mu[1]=rnorm(1,50,sqrt(sig0))
theta[,1]=rnorm(n,mu[1],sqrt(t2[1]*sigma2[1]))  #theta_i ~ N(mu, tau^2*sigma^2)

#Gibbs Sampling: update each parameter from its full conditional,
#always conditioning on the most recent draws of the others
for(i in 1:iter){
  t2[i+1]=rinvgamma(1,a+n/2,b+1/2*sum((theta[,i]-mu[i])^2)/sigma2[i])
  s <- NULL
  for(j in 1:n){
    theta[j,i+1]=rnorm(1,mean=(mu[i]/(t2[i+1]*sigma2[i])+sum(y[[j]])/sigma2[i])/(1/(t2[i+1]*sigma2[i])+ni[j]/sigma2[i]),sd=sqrt(1/(1/(t2[i+1]*sigma2[i])+ni[j]/sigma2[i])))
    s[j]=sum((y[[j]]-theta[j,i+1])^2)   #residual sum of squares for school j
  }
  sigma2[i+1]=rinvgamma(1,c+N/2+n/2,d+1/2*sum(s)+1/2*sum((theta[,i+1]-mu[i])^2)/t2[i+1])
  mu[i+1]=rnorm(1,mean=(mu0/sig0+sum(theta[,i+1])/(t2[i+1]*sigma2[i+1]))/(1/sig0+n/(t2[i+1]*sigma2[i+1])),sd=sqrt(1/(1/sig0+n/(t2[i+1]*sigma2[i+1]))))
}


#traceplots for sigma, tau, and mu after burn-in
plot(sqrt(sigma2[1000:iter]),type='l',ylab='sigma')

plot(sqrt(t2[1000:iter]),type='l',ylab='tau')

plot(mu[1000:iter],type='l',ylab='mu')
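
Beyond eyeballing the traceplots, a quick quantitative check is the effective sample size; a minimal sketch, assuming the coda package is installed:

library(coda)
draws=mcmc(cbind(mu=mu[1000:iter],tau2=t2[1000:iter],sigma2=sigma2[1000:iter]))
effectiveSize(draws)   #effective sample size per parameter after burn-in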

Part C

k <- NULL
for (j in 1:n){
  #shrinkage fraction: how far the posterior mean moved from the raw school mean
  k[j] = (mean(y[[j]])-mean(theta[j,1000:iter]))/mean(y[[j]])
}
plot(ni,k,xlab='school size',ylab='shrinkage fraction')

Schools with smaller sample sizes have their means shrunk more toward the grand mean, as anticipated in Part A.
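
This follows directly from the \(\theta_i\) full conditional in Part B: its mean is a precision-weighted average of \(\mu\) and \(\bar{y}_i\), \[E[\theta_i\,|\,\cdot]=\kappa_i\mu+(1-\kappa_i)\bar{y}_i,\qquad \kappa_i=\frac{1/(\tau^2\sigma^2)}{1/(\tau^2\sigma^2)+n_i/\sigma^2}=\frac{1}{1+n_i\tau^2},\] so the shrinkage weight \(\kappa_i\) decreases as \(n_i\) grows.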

Question 2

My model is \[\log Q_{ij}=a_i+\beta_{i1}\log P_{ij}+\beta_{i2}D_{ij}+\beta_{i3}(\log P_{ij}\cdot D_{ij})+\epsilon_{ij},\qquad \epsilon_{ij}\sim N(0,\sigma^2)\] \[a_i\sim N(v_1,\sigma_0^2),\quad \beta_{i1}\sim N(v_2,\sigma_0^2),\quad \beta_{i2}\sim N(v_3,\sigma_0^2),\quad \beta_{i3}\sim N(v_4,\sigma_0^2)\] with covariate vector \(x_{ij}=(\log P_{ij},\,D_{ij},\,\log P_{ij}\cdot D_{ij})\) and precision \[\lambda=\frac{1}{\sigma^2}\sim Ga(g,h),\qquad g=h=1.\]

Full likelihood (up to constants) \[\begin{align*} L &= \prod_{i,j} p(\log Q_{ij}|a_i,\beta_i,\lambda)\prod_{i=1}^n p(a_i)\,p(\beta_i)\;p(\lambda)\\ &\propto \prod_{i,j} \lambda^{1/2}e^{-\frac{\lambda}{2}(\log Q_{ij}-a_i-x_{ij}^T\beta_i)^2}\prod_{i=1}^n \left(e^{-\frac{(a_i-v_1)^2}{2\sigma_0^2}}\, e^{-\frac{(\beta_{i1}-v_2)^2}{2\sigma_0^2}}\, e^{-\frac{(\beta_{i2}-v_3)^2}{2\sigma_0^2}}\, e^{-\frac{(\beta_{i3}-v_4)^2}{2\sigma_0^2}}\right) \lambda^{g-1}e^{-h\lambda} \end{align*}\]

Hence the full conditionals are (note each slope's precision involves \(\sum_j x_{ij,k}^2\) rather than \(n_i\), since \(D_{ij}\) is not identically 1): \[\lambda\,|\,\cdot \sim Ga\left(g+\frac{N}{2},\ h+\frac{1}{2}\sum_i\sum_j(\log Q_{ij}-a_i-x_{ij}^T\beta_i)^2\right)\] \[a_i\,|\,\cdot \sim N\left(\frac{\frac{v_1}{\sigma_0^2}+\lambda\sum_j(\log Q_{ij}-\beta_{i1}x_{ij,1}-\beta_{i2}x_{ij,2}-\beta_{i3}x_{ij,3})}{\frac{1}{\sigma_0^2}+n_i\lambda},\ \frac{1}{\frac{1}{\sigma_0^2}+n_i\lambda}\right)\] \[\beta_{i1}\,|\,\cdot \sim N\left(\frac{\frac{v_2}{\sigma_0^2}+\lambda\sum_j(\log Q_{ij}-a_{i}-\beta_{i2}x_{ij,2}-\beta_{i3}x_{ij,3})x_{ij,1}}{\frac{1}{\sigma_0^2}+\lambda\sum_j x_{ij,1}^2},\ \frac{1}{\frac{1}{\sigma_0^2}+\lambda\sum_j x_{ij,1}^2}\right)\] \[\beta_{i2}\,|\,\cdot \sim N\left(\frac{\frac{v_3}{\sigma_0^2}+\lambda\sum_j(\log Q_{ij}-a_{i}-\beta_{i1}x_{ij,1}-\beta_{i3}x_{ij,3})x_{ij,2}}{\frac{1}{\sigma_0^2}+\lambda\sum_j x_{ij,2}^2},\ \frac{1}{\frac{1}{\sigma_0^2}+\lambda\sum_j x_{ij,2}^2}\right)\] \[\beta_{i3}\,|\,\cdot \sim N\left(\frac{\frac{v_4}{\sigma_0^2}+\lambda\sum_j(\log Q_{ij}-a_{i}-\beta_{i1}x_{ij,1}-\beta_{i2}x_{ij,2})x_{ij,3}}{\frac{1}{\sigma_0^2}+\lambda\sum_j x_{ij,3}^2},\ \frac{1}{\frac{1}{\sigma_0^2}+\lambda\sum_j x_{ij,3}^2}\right)\]

rm(list=ls())

#read in data
data=read.csv('~/Desktop/StatModeling2/cheese.csv')
factor_store=as.factor(data$store)
n=nlevels(factor_store)

#separate each store into its own list element
dt=vector("list",n)
y=vector("list",n)
x=vector("list",n)
ni <- NULL
for(i in 1:n){
  dt[[i]]=data[data$store==levels(factor_store)[i],]
  y[[i]]=log(dt[[i]]$vol)
  x[[i]]=cbind(log(dt[[i]]$price),dt[[i]]$disp,log(dt[[i]]$price)*dt[[i]]$disp)
  ni[i]=nrow(dt[[i]])
}
N=sum(ni)

#create placeholders
iter=5000
lambda <- NULL
a <- matrix(0,n,iter)
beta=array(0,c(3,n,iter))

#lambda ~ Ga(g,h); sig0 is the prior variance sigma_0^2
g=1
h=1
sig0=1
lambda[1]=rgamma(1,g,h)

#fix hyperparameters at reasonable values, chosen from a preliminary lm fit across stores
v1=10
v2=-1
v3=1
v4=-1
a[,1]=rnorm(n,v1,sqrt(sig0))
beta[1,,1]=rnorm(n,mean=v2,sd =sqrt(sig0))
beta[2,,1]=rnorm(n,mean=v3,sd =sqrt(sig0))
beta[3,,1]=rnorm(n,mean=v4,sd =sqrt(sig0))

#Gibbs sampling
#Iteratively update each parameter from its full conditional, conditioning on the
#most recent draws (note the slope precisions use sum(x^2), not ni)
for(i in 1:(iter-1)){
  s <- NULL
  for(j in 1:n){
    a[j,i+1]=rnorm(1,mean=(v1/sig0+lambda[i]*sum(y[[j]]-beta[1,j,i]*x[[j]][,1]-beta[2,j,i]*x[[j]][,2]-beta[3,j,i]*x[[j]][,3]))/(1/sig0+ni[j]*lambda[i]),sd=sqrt(1/(1/sig0+ni[j]*lambda[i])))
    p1=1/sig0+lambda[i]*sum(x[[j]][,1]^2)
    beta[1,j,i+1]=rnorm(1,mean=(v2/sig0+lambda[i]*sum((y[[j]]-a[j,i+1]-beta[2,j,i]*x[[j]][,2]-beta[3,j,i]*x[[j]][,3])*x[[j]][,1]))/p1,sd=sqrt(1/p1))
    p2=1/sig0+lambda[i]*sum(x[[j]][,2]^2)
    beta[2,j,i+1]=rnorm(1,mean=(v3/sig0+lambda[i]*sum((y[[j]]-a[j,i+1]-beta[1,j,i+1]*x[[j]][,1]-beta[3,j,i]*x[[j]][,3])*x[[j]][,2]))/p2,sd=sqrt(1/p2))
    p3=1/sig0+lambda[i]*sum(x[[j]][,3]^2)
    beta[3,j,i+1]=rnorm(1,mean=(v4/sig0+lambda[i]*sum((y[[j]]-a[j,i+1]-beta[1,j,i+1]*x[[j]][,1]-beta[2,j,i+1]*x[[j]][,2])*x[[j]][,3]))/p3,sd=sqrt(1/p3))
    s[j]=sum((y[[j]]-a[j,i+1]-beta[1,j,i+1]*x[[j]][,1]-beta[2,j,i+1]*x[[j]][,2]-beta[3,j,i+1]*x[[j]][,3])^2)
  }
  lambda[i+1]=rgamma(1,g+N/2,h+1/2*sum(s))
}
betafinal=apply(beta[,,3000:iter],c(1,2),mean)
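
As in Question 1, a quick traceplot of \(\sigma=1/\sqrt{\lambda}\) is a useful mixing check (a minimal sketch):

plot(1/sqrt(lambda[1000:iter]),type='l',ylab='sigma')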

#for store 1 
par(mfrow=c(2,2))
hist(a[1,500:iter],main='posterior Intercept for store 1')
hist(beta[1,1,500:iter],main='posterior beta_1 for store 1')
hist(beta[2,1,500:iter],main='posterior beta_2 for store 1')
hist(beta[3,1,500:iter],main='posterior beta_3 for store 1')

#for all stores (posterior means after burn-in)
par(mfrow=c(2,2))
hist(rowMeans(a[,3000:iter]),main='posterior intercept for all stores')
hist(betafinal[1,],main='posterior beta_1 for all stores')
hist(betafinal[2,],main='posterior beta_2 for all stores')
hist(betafinal[3,],main='posterior beta_3 for all stores')
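
For a quick numeric summary of the same quantities, we can also average the posterior-mean coefficients across stores:

#average posterior-mean coefficients across stores (burn-in discarded)
c(intercept=mean(rowMeans(a[,3000:iter])),beta1=mean(betafinal[1,]),beta2=mean(betafinal[2,]),beta3=mean(betafinal[3,]))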

Now we look at the fitted demand curves for each store.
In every store, if \(D_{ij}=0\), then \(\log Q_{ij}=a_i+\beta_{i1}\log P_{ij}\);
if \(D_{ij}=1\), then \(\log Q_{ij}=a_i+\beta_{i2}+(\beta_{i1}+\beta_{i3})\log P_{ij}\).
We plot each store in turn, blue for display = 0 and red for display = 1:

par(mfrow=c(2,2))
seq1=seq(0.6,1.4,by=0.01)
for (j in 1:n){
  D=x[[j]][,2]
  plot(x[[j]][D==0,1],y[[j]][D==0],pch=19,col='blue',xlab='logP',ylab='logQ',xlim=c(0.6,1.4),ylim=c(6,10))
  points(x[[j]][D==1,1],y[[j]][D==1],pch=19,col='red')
  #posterior-mean fitted lines (burn-in discarded)
  lines(seq1,mean(a[j,3000:iter])+mean(beta[2,j,3000:iter])+(mean(beta[1,j,3000:iter])+mean(beta[3,j,3000:iter]))*seq1,col='red')
  lines(seq1,mean(a[j,3000:iter])+mean(beta[1,j,3000:iter])*seq1,col='blue')
}

Note that I chose the hyperparameters \(v_1,\dots,v_4\) by first running a quick lm step across all stores and rounding the coefficients. I could also place another hierarchical level (hyperpriors) on these values, but since the first level already gives a good result, I kept the simpler approach.
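
For reference, a minimal sketch of that lm step, assuming the same cheese.csv columns as above (the rounded pooled coefficients motivated \(v_1,\dots,v_4\)):

fit=lm(log(vol)~log(price)*disp,data=data)
round(coef(fit),1)   #(Intercept), log(price), disp, log(price):disp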